Import required libraries
if (!require("ggplot2")) install.packages("ggplot2")
if (!require("stringr")) install.packages("stringr")
if (!require("ggrepel")) install.packages("ggrepel")
if (!require("dplyr")) install.packages("dplyr")
if (!require("patchwork")) install.packages("patchwork")
if (!require("BiocManager")) install.packages("BiocManager")
if (!require("ggbreak")) install.packages("ggbreak")
if (!require("clusterProfiler")) BiocManager::install("clusterProfiler")
if (!require("AnnotationDbi")) BiocManager::install("AnnotationDbi")
if (!require("org.Hs.eg.db")) BiocManager::install("org.Hs.eg.db")
if (!require("org.Hs.eg.db")) BiocManager::install("DESeq2")
if (!require("edgeR")) BiocManager::install("edgeR")
library(ggplot2)
library(stringr)
library(ggrepel)
library(dplyr)
library(patchwork)
library(clusterProfiler)
library(AnnotationDbi)
library(org.Hs.eg.db)
library(DESeq2)
library(plotly)
library(ggbreak)
library(edgeR)
CoV_mean <- function(arg_exp){
mean_values_ <- rowMeans(arg_exp)
CoV_values_ <- apply(arg_exp, 1, function(row) sd(row) / mean(row))
stats_Experiment <- cbind(mean_values_, CoV_values_)
colnames(stats_Experiment) <- c("Mean_expression",
"CoV")
return(as.data.frame(stats_Experiment))
}
plot_CoV_mean <- function(arg_dataframe, arg_x, arg_y,
arg_lab_title = "title", arg_lab_x = "x", arg_lab_y = "y",
arg_color = "blue"){
ggplot(arg_dataframe, aes_string(x = arg_x, y = arg_y, label=c("Gene"))) +
geom_point(color = arg_color, alpha = 0.4) +
labs(x = arg_lab_x, y = arg_lab_y, title = arg_lab_title)+
theme_light()+
theme(plot.title = element_text(size = 12, hjust=0.5))+
xlim(2.5, 15)+
ylim(-0.01, 0.5)
}
Import dataset with gene counts
genes_list <- read.delim(file.path(project_dir, "GSE107593_raw_reads_BCHRNAseq.txt"), check.names=F)
colnames(genes_list) <- c("ENSG_symbol", colnames(genes_list[2:length(colnames(genes_list))]))
genes_list[,c(1,3,10:ncol(genes_list))]
# Read GSE107593_series_matrix.txt
metadata_matrix <- read.delim(file.path(project_dir, "Dataset copy, untouched", "GSE107593_series_matrix.txt"), check.names=F, skip = 25)
metadata_matrix_2 <- data.frame(t(metadata_matrix[,2:ncol(metadata_matrix)]))
colnames(metadata_matrix_2) <- metadata_matrix[,1]
# Obtain patient ids
patient_ids <- unique(str_replace(metadata_matrix_2[,12], "subject: ",""))
for (el in patient_ids){
cat(paste0(el,"\n"))
}
157
877
1057
1077
1192
1214
8854
8855
8874
8878
8879
8881
# Obtain sampling locations
location <- unique(str_replace(metadata_matrix_2[,11], "location: ",""))
for (el in sort(location)){
cat(paste0(el,"\n"))
}
Ascending
Descending
Large
Rectum
Sigmoid
Tranverse
# Create df for normalization:
genes_list_lognorm <- data.frame(matrix(NA, nrow = nrow(genes_list), ncol = ncol(genes_list)))
# Columns before counts
for (col in 1:9){
genes_list_lognorm[[col]] <- genes_list[[col]]
}
# Column names
colnames(genes_list_lognorm) <- colnames(genes_list)
for (i in 10:ncol(genes_list)) {
genes_list_lognorm[, i] <- log2((genes_list[, i] / colSums(genes_list[i]) * 1000000)+8)
}
genes_list_lognorm[,c(1,3,10:ncol(genes_list))]
df_mean_vs_CoV <- genes_list_lognorm[,10:ncol(genes_list)]
stats_ <- CoV_mean(df_mean_vs_CoV)
stats_$ENSG <- genes_list_lognorm[[1]]
stats_$Gene <- genes_list_lognorm[[3]]
p1 <- plot_CoV_mean(stats_, arg_x = "Mean_expression", arg_y = "CoV", arg_lab_title = "Mean vs CoV, all samples", arg_lab_x = "Mean expression across samples", arg_lab_y = "Coefficient of Variation")
Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
ℹ Please use tidy evaluation idioms with `aes()`.
ℹ See also `vignette("ggplot2-in-packages")` for more information.
This warning is displayed once every 8 hours.
Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
generated.
p1
Warning: Removed 2 rows containing missing values or values outside the scale range
(`geom_point()`).
most_variable_genes <- (stats_ %>% arrange(desc(CoV)))[1:500,]
p2 <- plot_CoV_mean(most_variable_genes, arg_x = "Mean_expression", arg_y = "CoV", arg_lab_title = "Mean vs CoV, all samples, 500 most variable genes", arg_lab_x = "Mean expression across samples", arg_lab_y = "Coefficient of Variation")
p1plotly <- ggplotly(p2)
p1plotly
# Create dataframe for PCA of lognormalized samples
genes_list_lognorm_for_pca <- t(genes_list_lognorm[,10:ncol(genes_list_lognorm)])
colnames(genes_list_lognorm_for_pca) <- rownames(genes_list_lognorm)
# PCA of lognorm counts
pca_lognorm <- prcomp(genes_list_lognorm_for_pca)
# Screeplot
pca_lognorm_metrics <- as.data.frame(t(summary(pca_lognorm)$importance))
colnames(pca_lognorm_metrics) <- c("Standard_deviation", "Proportion_of_Variance", "Cumulative_Proportion")
pca_lognorm_metrics$PCs <- as.numeric(substr(rownames(pca_lognorm_metrics), 3, length(rownames(pca_lognorm_metrics))))
ggplot(data = pca_lognorm_metrics[1:10,], aes(x=PCs, y=Proportion_of_Variance)) +
geom_line(color="blue") +
geom_point() +
scale_x_continuous(breaks = c(1:10)) +
labs(title="PCA of lognormalized counts - Variance explained per principal component")+
xlab("Principal component")+
ylab("Proportion of variance explained")+
theme_light()+
theme(plot.title = element_text(size=12),
axis.title.x = element_text(size = 11),
axis.title.y = element_text(size = 11),
text = element_text(size=11),
aspect.ratio = 1/1.5)
# Create dataframe to contain relevant metadata in appropriate format
df_samples_treatments <- data.frame(row.names = rownames(metadata_matrix_2))
# Get sample name and inflammation status
df_samples_treatments$sample_inflamation <- ""
for (row in (1:nrow(metadata_matrix_2))){
df_samples_treatments[row, ncol(df_samples_treatments)] <- str_split(
rownames(df_samples_treatments)[row],
"Ulcerative colitis Colon Biopsy ",
simplify = T)[2]}
# Get sample name
df_samples_treatments$sample <- ""
for (row in (1:nrow(df_samples_treatments))){
df_samples_treatments[row, ncol(df_samples_treatments)] <- gsub(" [^ ]*$", "", df_samples_treatments[row, ncol(df_samples_treatments)-1])
}
# Get inflammation status
df_samples_treatments$inflammation <- ""
for (row in (1:nrow(df_samples_treatments))){
df_samples_treatments[row, ncol(df_samples_treatments)] <- gsub(".* ", "", df_samples_treatments[row, ncol(df_samples_treatments)-2])
}
# Get sample location
df_samples_treatments$location <- ""
for (row in (1:nrow(df_samples_treatments))){
df_samples_treatments[row, "location"] <- gsub("location: ", "", metadata_matrix_2[row, 11])
}
# Get sampling hospital
df_samples_treatments$hospital <- ""
for (row in (1:nrow(df_samples_treatments))){
df_samples_treatments[row, "hospital"] <- gsub("site: ", "", metadata_matrix_2[row, 10])
}
# Create pca scores dataframe for lognormalized counts pca
pca_scores_lognorm <- data.frame(pca_lognorm$x[,1:10])
pca_scores_lognorm$patient <- ""
for (row in (1:nrow(pca_scores_lognorm))){
for (id_patient in 1:length(patient_ids)){
if(!is.na(str_extract(rownames(pca_scores_lognorm[row,]), patient_ids[id_patient]))){
pca_scores_lognorm[row, ncol(pca_scores_lognorm)] <- str_extract(rownames(pca_scores_lognorm[row,]), patient_ids[id_patient])
}
}
}
# Get the sample id in the PCA scores dataframe
pca_scores_lognorm$sample <- rownames(pca_scores_lognorm)
# Get the inflammation status of each sample
pca_scores_lognorm$inflammation <- ""
for(row_scores in 1:nrow(pca_scores_lognorm)){
for(row_df in 1:nrow(df_samples_treatments)){
if(pca_scores_lognorm[row_scores, "sample"] == df_samples_treatments[row_df, "sample"]){
pca_scores_lognorm[row_scores, "inflammation"] <- df_samples_treatments[row_df, "inflammation"]
}
}
}
# Get the sample location in the PCA scores dataframe
pca_scores_lognorm$location <- ""
for(row_scores in 1:nrow(pca_scores_lognorm)){
for(row_df in 1:nrow(df_samples_treatments)){
if(pca_scores_lognorm[row_scores, "sample"] == df_samples_treatments[row_df, "sample"]){
pca_scores_lognorm[row_scores, "location"] <- df_samples_treatments[row_df, "location"]
}
}
}
# Get the sampling hospital in the PCA scores dataframe
pca_scores_lognorm$hospital <- ""
for(row_scores in 1:nrow(pca_scores_lognorm)){
for(row_df in 1:nrow(df_samples_treatments)){
if(pca_scores_lognorm[row_scores, "sample"] == df_samples_treatments[row_df, "sample"]){
pca_scores_lognorm[row_scores, "hospital"] <- df_samples_treatments[row_df, "hospital"]
}
}
}
p1 <- ggplot(pca_scores_lognorm, aes(x=PC1, y=PC2, colour = inflammation)) +
geom_point(size=2)+
labs(title = "Inflammation status")+
theme_light()+
theme(legend.position = "bottom",
plot.title = element_text(hjust = 0.5),
aspect.ratio = 1)+
guides(colour=guide_legend(ncol=1))
p2 <- ggplot(pca_scores_lognorm, aes(x=PC1, y=PC2, colour = location)) +
geom_point(size=2)+
labs(title = "Sample location")+
theme_light()+
theme(legend.position = "bottom",
plot.title = element_text(hjust = 0.5),
aspect.ratio = 1)+
guides(colour=guide_legend(ncol=2))
p1+p2+plot_annotation(title = "PCA scores according to sample metadata - inflammation status and sample location",
subtitle = "Counts log-normalized",
theme = theme(plot.title = element_text(hjust = 0.5, vjust = 3),
plot.subtitle = element_text(hjust = 0.5, vjust = 3)))
ggplot(pca_scores_lognorm, aes(x=PC1, y=PC2, colour = patient, shape = inflammation, label=patient)) +
geom_point(size=3)+
geom_text_repel(nudge_x = 2)+
labs(title = "PCA scores by patient ID and inflammation status")+
theme_light()+
theme(plot.title = element_text(hjust=0.5))
# Create dataframe for PCA of lognormalized samples using 500 most variable genes
genes_list_lognorm_most_var_for_pca <- t((genes_list_lognorm %>% filter(ENSG_symbol %in% most_variable_genes$ENSG))[,10:ncol(genes_list_lognorm)])
colnames(genes_list_lognorm_most_var_for_pca) <- most_variable_genes$ENSG
# PCA of lognorm counts
pca_lognorm_most_var <- prcomp(genes_list_lognorm_most_var_for_pca)
pca_scores_lognorm_most_var_metrics <- data.frame(pca_lognorm_most_var$x[,1:10])
pca_scores_lognorm_most_var_metrics$patient <- pca_scores_lognorm$patient
pca_scores_lognorm_most_var_metrics$sample <- pca_scores_lognorm$sample
pca_scores_lognorm_most_var_metrics$inflammation <- pca_scores_lognorm$inflammation
pca_scores_lognorm_most_var_metrics$location <- pca_scores_lognorm$location
pca_scores_lognorm_most_var_metrics$hospital <- pca_scores_lognorm$hospital
p1 <- ggplot(pca_scores_lognorm_most_var_metrics, aes(x=PC1, y=PC2, colour = inflammation)) +
geom_point(size=2)+
labs(title = "Inflammation status")+
theme_light()+
theme(legend.position = "bottom",
plot.title = element_text(hjust = 0.5),
aspect.ratio = 1)+
guides(colour=guide_legend(ncol=1))
p2 <- ggplot(pca_scores_lognorm_most_var_metrics, aes(x=PC1, y=PC2, colour = location)) +
geom_point(size=2)+
labs(title = "Sample location")+
theme_light()+
theme(legend.position = "bottom",
plot.title = element_text(hjust = 0.5),
aspect.ratio = 1)+
guides(colour=guide_legend(ncol=2))
p1+p2+plot_annotation(title = "PCA scores according to sample metadata - inflammation status and sample location",
subtitle = "Counts log-normalized, PCA over 500 Most variable genes",
theme = theme(plot.title = element_text(hjust = 0.5, vjust = 3),
plot.subtitle = element_text(hjust = 0.5, vjust = 3)))
# Extract the rotations for pcas with lognormalized counts, all genes
pca_rotations <- data.frame(pca_lognorm$rotation[,1:10])
pca_lognorm_rotations <- data.frame(pca_lognorm$rotation[,1:10])
# Add column with gene names to rotations dataframes
pca_rotations$gene_name <- genes_list$gene_name
pca_lognorm_rotations$gene_name <- genes_list$gene_name
# Show genes with higher rotation values for pca (raw counts)
top_pos_pca_lognorm <- pca_lognorm_rotations %>% arrange(desc(PC1)) %>% dplyr::select(c(PC1, gene_name))
top20_pos_pca_lognorm <- top_pos_pca_lognorm[1:20,]
top20_pos_pca_lognorm$gene_name
[1] "IGHG1" "DUOX2" "IGHG3" "PI3" "DUOXA2" "IGHG2"
[7] "LCN2" "CHI3L1" "CXCL1" "IGHG4" "NOS2" "REG1A"
[13] "SLC6A14" "MMP3" "C3" "DMBT1" "IGHV4-34" "REG4"
[19] "PLA2G2A" "IGFBP5"
GO_PC1_pos <- enrichGO(gene = top20_pos_pca_lognorm$gene_name, OrgDb = "org.Hs.eg.db", keyType = "SYMBOL", ont = "BP")
temp_GO_PC1_pos <- as.data.frame(GO_PC1_pos)[,c("ID", "Description", "GeneRatio", "p.adjust")]
temp_GO_PC1_pos$Description <- strtrim(temp_GO_PC1_pos$Description, 50)
rownames(temp_GO_PC1_pos) <- 1:nrow(temp_GO_PC1_pos)
temp_GO_PC1_pos
# Show genes with more negative rotation values for pca (raw counts)
top_neg_pca_lognorm <- pca_lognorm_rotations %>% arrange(PC1) %>% dplyr::select(c(PC1, gene_name))
top20_neg_pca_lognorm <- top_neg_pca_lognorm[1:20,]
top20_neg_pca_lognorm$gene_name
[1] "AQP8" "HMGCS2" "SLC26A2" "CA1" "ANPEP" "PCK1"
[7] "CHP2" "B4GALNT2" "GUCA2A" "ADH1C" "PADI2" "SLC26A3"
[13] "ABCG2" "FABP1" "CA2" "LYPD8" "CKB" "SLC51A"
[19] "UGT2A3" "SLC9A3"
GO_PC1_neg <- enrichGO(gene = top20_neg_pca_lognorm$gene_name, OrgDb = "org.Hs.eg.db", keyType = "SYMBOL", ont = "BP")
temp_GO_PC1_neg <- as.data.frame(GO_PC1_neg)[,c("ID", "Description", "GeneRatio", "p.adjust")]
rownames(temp_GO_PC1_neg) <- 1:nrow(temp_GO_PC1_neg)
temp_GO_PC1_neg
cts <- as.matrix(genes_list[,c(10:ncol(genes_list))])
rownames(cts) <- genes_list[[3]]
colData <- pca_scores_lognorm[,c(11, 13:15)]
colData$inflammation <- factor(colData$inflammation, levels = c("Non-Inflamed", "Inflamed"))
colData$location <- factor(colData$location)
colData$hospital <- factor(colData$hospital)
cts <- cts[, rownames(colData)]
as.data.frame(cts)
colData
dds <- suppressMessages(DESeqDataSetFromMatrix(countData = cts,
colData = colData,
design = ~ inflammation))
smallestGroupSize <- 3
keep <- rowSums(counts(dds) >= 10) >= smallestGroupSize
dds <- dds[keep,]
significance_level <- 0.001
dds <- suppressMessages(DESeq(dds))
res <- results(dds, alpha = significance_level)
gene_list <- as.data.frame(res)
gene_list$Gene <- rownames(gene_list)
gene_list %>% filter(padj<significance_level) %>% arrange(padj)
plotMA(res)
ggplot(gene_list, aes(x=pvalue))+
geom_histogram()
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(gene_list, aes(x=padj))+
geom_histogram()
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
gene_list$significance <- NA
for (gene in 1:nrow(gene_list)){
if (gene_list[gene,"log2FoldChange"]>=1 & gene_list[gene,"padj"]< significance_level){
gene_list[gene, "significance"] <- "Up"
}
else if (gene_list[gene,"log2FoldChange"]<=(-1) & gene_list[gene,"padj"]< significance_level){
gene_list[gene, "significance"] <- "Down"
}
else if (abs(gene_list[gene,"log2FoldChange"])<1 | gene_list[gene,"padj"]>= significance_level){
gene_list[gene, "significance"] <- "Not significant"
}
}
gene_list_sig_up_outlier <- gene_list %>% filter(log2FoldChange > 20)
# Get upregulated genes
dge_up <- gene_list %>% filter(significance == "Up")
# Get downregulated genes
dge_down <- gene_list %>% filter(significance == "Down")
GO_dge_up <- enrichGO(gene = dge_up$Gene, universe = gene_list$Gene, OrgDb = "org.Hs.eg.db", keyType = "SYMBOL", ont = "BP")
temp_GO_dge_up <- as.data.frame(GO_dge_up)[,c("ID", "Description", "GeneRatio", "p.adjust")]
temp_GO_dge_up$Description <- strtrim(temp_GO_dge_up$Description, 50)
rownames(temp_GO_dge_up) <- 1:nrow(temp_GO_dge_up)
temp_GO_dge_up
GO_dge_down <- enrichGO(gene = dge_down$Gene, universe = gene_list$Gene, OrgDb = "org.Hs.eg.db", keyType = "SYMBOL", ont = "BP")
temp_GO_dge_down <- as.data.frame(GO_dge_down)[,c("ID", "Description", "GeneRatio", "p.adjust")]
temp_GO_dge_down$Description <- strtrim(temp_GO_dge_down$Description, 50)
rownames(temp_GO_dge_down) <- 1:nrow(temp_GO_dge_down)
temp_GO_dge_down
Construct edgeR DGElist object
cts_edgeR <- cts
inflammation_status <- as.factor(colData$inflammation)
inflammation_status <- relevel(inflammation_status, "Non-Inflamed")
dge <- DGEList(counts = cts_edgeR, group = inflammation_status)
Filter low-expressed genes
keep <- filterByExpr(y=dge)
dge <- dge[keep, , keep.lib.sizes=F]
Normalization
dge <- calcNormFactors(dge)
dge$samples
Estimate dispersion
dge <- estimateDisp(y = dge)
Using classic mode.
diff_expr <- exactTest(dge)
top_diff_expr <- topTags(diff_expr, n="Inf")
gene_list_edgeR <- as.data.frame(top_diff_expr)
gene_list_edgeR
gene_list_edgeR$Gene <- rownames(gene_list_edgeR)
gene_list_edgeR$significance <- NA
for (gene in 1:nrow(gene_list_edgeR)){
if (gene_list_edgeR[gene,"logFC"]>=1 & gene_list_edgeR[gene,"FDR"]< significance_level){
gene_list_edgeR[gene, "significance"] <- "Up"
}
else if (gene_list_edgeR[gene,"logFC"]<=(-1) & gene_list_edgeR[gene,"FDR"]< significance_level){
gene_list_edgeR[gene, "significance"] <- "Down"
}
else if (abs(gene_list_edgeR[gene,"logFC"])<1 | gene_list_edgeR[gene,"FDR"]>= significance_level){
gene_list_edgeR[gene, "significance"] <- "Not significant"
}
}
# Get upregulated genes
dge_edgeR_up <- gene_list_edgeR %>% filter(significance == "Up")
# Get downregulated genes
dge_edgeR_down <- gene_list_edgeR %>% filter(significance == "Down")
dge_edgeR_up <- enrichGO(gene = dge_edgeR_up$Gene, universe = gene_list_edgeR$Gene, OrgDb = "org.Hs.eg.db", keyType = "SYMBOL", ont = "BP")
temp_GO_dge_edgeR_up <- as.data.frame(dge_edgeR_up)[,c("ID", "Description", "GeneRatio", "p.adjust")]
temp_GO_dge_edgeR_up$Description <- strtrim(temp_GO_dge_edgeR_up$Description, 50)
rownames(temp_GO_dge_edgeR_up) <- 1:nrow(temp_GO_dge_edgeR_up)
temp_GO_dge_edgeR_up
GO_dge_edgeR_down <- enrichGO(gene = dge_edgeR_down$Gene, universe = gene_list_edgeR$Gene, OrgDb = "org.Hs.eg.db", keyType = "SYMBOL", ont = "BP")
temp_GO_dge_edgeR_down <- as.data.frame(GO_dge_edgeR_down)[,c("ID", "Description", "GeneRatio", "p.adjust")]
temp_GO_dge_edgeR_down$Description <- strtrim(temp_GO_dge_edgeR_down$Description, 50)
rownames(temp_GO_dge_edgeR_down) <- 1:nrow(temp_GO_dge_edgeR_down)
temp_GO_dge_edgeR_down